COVID-19 Data Visualization
library( "ggpmisc")
## Warning: pakiet 'ggpmisc' został zbudowany w wersji R 4.3.2
## Ładowanie wymaganego pakietu: ggpp
## Warning: pakiet 'ggpp' został zbudowany w wersji R 4.3.2
## Ładowanie wymaganego pakietu: ggplot2
## Warning: pakiet 'ggplot2' został zbudowany w wersji R 4.3.2
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
##
## Dołączanie pakietu: 'ggpp'
## Następujący obiekt został zakryty z 'package:ggplot2':
##
## annotate
## Registered S3 method overwritten by 'ggpmisc':
## method from
## as.character.polynomial polynom
library("ggplot2")
library("plotly")
## Warning: pakiet 'plotly' został zbudowany w wersji R 4.3.2
##
## Dołączanie pakietu: 'plotly'
## Następujący obiekt został zakryty z 'package:ggplot2':
##
## last_plot
## Następujący obiekt został zakryty z 'package:stats':
##
## filter
## Następujący obiekt został zakryty z 'package:graphics':
##
## layout
library(readr)
## Warning: pakiet 'readr' został zbudowany w wersji R 4.3.2
# Load total_cases.csv, a dataset with only two columns: date and cases.
total_cases <- read_csv(
  file = "C:/Users/Martynaa/Desktop/portfolio/analizy_R/total_cases.csv"
)
## Rows: 64 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): cases
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of total_cases.
head(total_cases)
## # A tibble: 6 × 2
## date cases
## <date> <dbl>
## 1 2020-11-21 656305
## 2 2020-11-22 607893
## 3 2020-11-23 547681
## 4 2020-11-24 463730
## 5 2020-11-25 603506
## 6 2020-11-26 589316
# Structure: column names, column types and attributes of total_cases.
str(total_cases)
## spc_tbl_ [64 × 2] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ date : Date[1:64], format: "2020-11-21" "2020-11-22" ...
## $ cases: num [1:64] 656305 607893 547681 463730 603506 ...
## - attr(*, "spec")=
## .. cols(
## .. date = col_date(format = ""),
## .. cases = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Summary: per-column minimum, quartiles, mean and maximum of total_cases.
summary(total_cases)
## date cases
## Min. :2020-11-21 Min. :312202
## 1st Qu.:2020-12-06 1st Qu.:570826
## Median :2020-12-22 Median :628794
## Mean :2020-12-22 Mean :629472
## 3rd Qu.:2021-01-07 3rd Qu.:694328
## Max. :2021-01-23 Max. :898893
# Dimensions: number of rows and number of columns of total_cases.
dim(total_cases)
## [1] 64 2
# Interactive chart
# Build a static ggplot line chart of cases against date first ...
first_chart <- ggplot(
  data = total_cases,
  mapping = aes(x = date, y = cases)
) +
  geom_line(colour = "blue") +
  labs(
    x = "Date",
    y = "Sum: confirmed cases",
    title = "Number of COVID-19 cases"
  )
# ... then convert it to an interactive plotly widget.
ggplotly(first_chart)
# Histogram and density plot
# The histogram is drawn on the density scale so it shares its y-axis with the
# kernel density overlay. `after_stat(density)` replaces the `..density..`
# notation that was deprecated in ggplot2 3.4.0. `bins = 30` spells out the
# default bin count, which keeps the same bars and silences the `stat_bin()`
# message.
ggplot(total_cases, aes(x = cases)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    color = "black",
    fill = "white"
  ) +
  geom_density(color = "lightpink", fill = "lightpink", alpha = 0.4) +
  theme(
    plot.background = element_rect("white"),
    panel.background = element_rect("white"),
    axis.line = element_line("black"),
    panel.grid.major = element_line(colour = "grey50")
  ) +
  labs(title = "Histogram and density plot")
# Load the dataset that presents 7 countries and their total cases each day,
# then preview its first rows.
top7 <- read.csv(
  file = "C:/Users/Martynaa/Desktop/portfolio/analizy_R/top7_02_2020.csv"
)
head(top7)
## country date cum_cases
## 1 Germany 2020-02-18 16
## 2 Iran 2020-02-18 0
## 3 Italy 2020-02-18 3
## 4 Korea, South 2020-02-18 31
## 5 Spain 2020-02-18 2
## 6 US 2020-02-18 13
# Structure: column names and column types of top7.
str(top7)
## 'data.frame': 2030 obs. of 3 variables:
## $ country : chr "Germany" "Iran" "Italy" "Korea, South" ...
## $ date : chr "2020-02-18" "2020-02-18" "2020-02-18" "2020-02-18" ...
## $ cum_cases: int 16 0 3 31 2 13 13 13 13 13 ...
# Summary:
# summary() reports per-column statistics. dplyr's summarise(), when called
# without any summary expressions, only returns a data frame with zero columns
# and one row, so it is not useful for inspecting the data.
summary(top7)
# Color-coded chart by country
# `date` is a character column in top7 (see str(top7)), so it is converted with
# as.Date() to get a continuous date axis instead of one discrete tick per
# distinct date string. The y-axis is logarithmic; rows with cum_cases == 0
# become infinite under the transformation, which is what the warning reports.
ggplot(data = top7, aes(x = as.Date(date), y = cum_cases, col = country)) +
  geom_point() +
  scale_y_log10() +
  labs(title = "Color-coded chart by country", x = "Date", y = "Cum_cases") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Transformation introduced infinite values in continuous y-axis
# Infection trend over time
# One line per country; the character `date` column is parsed to Date for the
# x-axis and the y-axis stays linear.
ggplot(
  data = top7,
  mapping = aes(x = as.Date(date), y = cum_cases, colour = country)
) +
  geom_line() +
  labs(
    x = "Date",
    y = "Cum_sum",
    title = "Infection trend over time"
  )
# Comparison of the number of cases on a logarithmic scale: the log y-axis
# makes growth proportions easier to compare between countries.
ggplot(
  data = top7,
  mapping = aes(x = as.Date(date), y = cum_cases, colour = country)
) +
  geom_line() +
  scale_y_log10() +
  labs(
    x = "Date",
    y = "Cum_sum (log)",
    title = "Comparison of the number of cases on a logarithmic scale"
  )
## Warning: Transformation introduced infinite values in continuous y-axis
# Number of cases on selected days: start, middle, end
# `date` is a character column here, so the days are matched as strings.
selected_dates <- top7 %>%
  filter(date %in% c("2020-02-18", "2020-03-01", "2020-03-15"))
# One panel per selected day, one bar per country.
ggplot(
  data = selected_dates,
  mapping = aes(x = country, y = cum_cases, fill = country)
) +
  geom_col(position = "dodge") +
  facet_wrap(~date) +
  labs(
    x = "Country",
    y = "Number of Cases",
    title = "Number of cases on selected days"
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Percentage of countries in the global number of cases
# Take each country's largest cumulative count, then express it as a share of
# the sum over the countries present in top7.
total_cases_by_country <- top7 %>%
  group_by(country) %>%
  summarise(total_cases = max(cum_cases, na.rm = TRUE)) %>%
  mutate(percentage = (total_cases / sum(total_cases)) * 100)
# Bars are ordered from the largest to the smallest share.
ggplot(
  data = total_cases_by_country,
  mapping = aes(
    x = reorder(country, -percentage),
    y = percentage,
    fill = country
  )
) +
  geom_col() +
  labs(
    x = "Country",
    y = "Percentage",
    title = "Percentage of countries in the global number of cases"
  )
# Heatmap
# One tile per country and date, shaded from white (low cum_cases) to red
# (high cum_cases).
ggplot(
  data = top7,
  mapping = aes(x = as.Date(date), y = country, fill = cum_cases)
) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "red") +
  labs(
    x = "Date",
    y = "Country",
    title = "Heatmap of cases over time"
  )
library(dplyr)
## Warning: pakiet 'dplyr' został zbudowany w wersji R 4.3.2
##
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
##
## filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
##
## intersect, setdiff, setequal, union
# Load the dataset that presents countries, their provinces, dates and cases.
data_c <- read_csv(
  file = "C:/Users/Martynaa/Desktop/portfolio/analizy_R/cases_by_country.csv"
)
## Rows: 13272 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): country, province
## dbl (2): cases, cum_cases
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of data_c.
head(data_c)
## # A tibble: 6 × 5
## country province date cases cum_cases
## <chr> <chr> <date> <dbl> <dbl>
## 1 Afghanistan <NA> 2020-01-22 0 0
## 2 Albania <NA> 2020-01-22 0 0
## 3 Algeria <NA> 2020-01-22 0 0
## 4 Andorra <NA> 2020-01-22 0 0
## 5 Antigua and Barbuda <NA> 2020-01-22 0 0
## 6 Argentina <NA> 2020-01-22 0 0
# Structure: column names, column types and attributes of data_c.
str(data_c)
## spc_tbl_ [13,272 × 5] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ country : chr [1:13272] "Afghanistan" "Albania" "Algeria" "Andorra" ...
## $ province : chr [1:13272] NA NA NA NA ...
## $ date : Date[1:13272], format: "2020-01-22" "2020-01-22" ...
## $ cases : num [1:13272] 0 0 0 0 0 0 0 0 0 0 ...
## $ cum_cases: num [1:13272] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. country = col_character(),
## .. province = col_character(),
## .. date = col_date(format = ""),
## .. cases = col_double(),
## .. cum_cases = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Summary: per-column summary statistics of data_c.
summary(data_c)
## country province date cases
## Length:13272 Length:13272 Min. :2020-01-22 Min. : -20.000
## Class :character Class :character 1st Qu.:2020-02-04 1st Qu.: 0.000
## Mode :character Mode :character Median :2020-02-18 Median : 0.000
## Mean :2020-02-18 Mean : 8.747
## 3rd Qu.:2020-03-03 3rd Qu.: 0.000
## Max. :2020-03-17 Max. :5198.000
## cum_cases
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 2.0
## Mean : 182.5
## 3rd Qu.: 15.0
## Max. :31506.0
# top 9
# Per country: the number of rows (`n`) and the largest cumulative case count
# (`max`). The row count is stored in its own column so that it does not
# overwrite `country`, which is needed to identify the countries in the
# ranking below.
top <- data_c %>%
  group_by(country) %>%
  summarise(n = n(), max = max(cum_cases))
top
# Keep the nine countries with the largest maximum cumulative case count.
top9 <- top %>%
  arrange(desc(max)) %>%
  slice_head(n = 9) %>%
  select(country, max)
top9
# As we can see, the top 7 countries with the highest number of COVID-19 cases are the same countries as in the previous dataset.
# Interactive chart for USA provinces
# Keep only the US rows dated after 2020-02-20.
dane <- data_c %>%
  filter(country == "US", date > as.Date("2020-02-20"))
dane
## # A tibble: 1,482 × 5
## country province date cases cum_cases
## <chr> <chr> <date> <dbl> <dbl>
## 1 US Alabama 2020-02-21 0 13
## 2 US Alaska 2020-02-21 0 13
## 3 US Arizona 2020-02-21 0 13
## 4 US Arkansas 2020-02-21 0 13
## 5 US California 2020-02-21 2 15
## 6 US Colorado 2020-02-21 0 15
## 7 US Connecticut 2020-02-21 0 15
## 8 US Delaware 2020-02-21 0 15
## 9 US Diamond Princess 2020-02-21 0 15
## 10 US District of Columbia 2020-02-21 0 15
## # ℹ 1,472 more rows
# Scatter plot of cases against date, one colour per province, using the
# 8-colour "Set2" ColorBrewer palette.
plot_ly(
  data = dane,
  x = ~date,
  y = ~cases,
  color = ~province,
  colors = RColorBrewer::brewer.pal(8, "Set2"),
  type = "scatter",
  mode = "markers"
)